{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c3d97644",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2995bfb0d0f4455da0b7a0d756a62da4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4min 4s, sys: 30.7 s, total: 4min 35s\n",
      "Wall time: 2min 17s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "import os\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
    "\n",
    "from dataclasses import dataclass, field\n",
    "from typing import Optional\n",
    "import contextlib\n",
    "\n",
    "import torch\n",
    "from datasets import load_dataset\n",
    "from peft import LoraConfig\n",
    "from transformers import (\n",
    "    AutoModelForCausalLM,\n",
    "    AutoTokenizer,\n",
    "    BitsAndBytesConfig,\n",
    "    HfArgumentParser,\n",
    "    AutoTokenizer,\n",
    "    TrainingArguments,\n",
    ")\n",
    "from peft import (\n",
    "    prepare_model_for_kbit_training,\n",
    "    LoraConfig,\n",
    "    get_peft_model,\n",
    "    PeftModel\n",
    ")\n",
    "\n",
    "# Keep the checkpoint id under its own name so that `model` only ever refers to the loaded network.\n",
    "base_model_id = \"bigcode/octocoder\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)\n",
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    base_model_id,\n",
    "    quantization_config=None,\n",
    "    device_map=None,\n",
    "    trust_remote_code=True,\n",
    "    torch_dtype=torch.bfloat16,\n",
    ")\n",
    "\n",
    "# Wrap the base model with the PEFT adapter weights, registered under the adapter name \"copilot\".\n",
    "model_id = \"smangrul/peft-lora-starcoder15B-v2-personal-copilot-A100-40GB-colab\"\n",
    "model = PeftModel.from_pretrained(model, model_id, adapter_name=\"copilot\")\n",
    "\n",
    "# No `device_map` was requested above, so the weights are moved to the GPU by hand unless\n",
    "# the loaded model reports an `hf_device_map`.\n",
    "if not hasattr(model, \"hf_device_map\"):\n",
    "    model.cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0abf9722",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_code_completion(prefix, suffix, disable=False):\n",
    "    \"\"\"Generate a fill-in-the-middle completion for the gap between `prefix` and `suffix`.\n",
    "\n",
    "    Args:\n",
    "        prefix: Text placed after the `<fim_prefix>` marker (the code before the gap).\n",
    "        suffix: Text placed after the `<fim_suffix>` marker (the code after the gap; may be empty).\n",
    "        disable: When True, generation runs inside `model.disable_adapter()`.\n",
    "\n",
    "    Returns:\n",
    "        The first decoded sequence: the prompt followed by the generated tokens, with special\n",
    "        tokens kept (`skip_special_tokens=False`).\n",
    "    \"\"\"\n",
    "    text = f\"\"\"<fim_prefix>{prefix}<fim_suffix>{suffix}<fim_middle>\"\"\"\n",
    "    # Keep the tokenizer's attention mask and follow the model's device instead of forwarding\n",
    "    # only `input_ids` through a hard-coded `.cuda()`; this removes the \"attention mask and the\n",
    "    # pad token id were not set\" warning emitted by `generate`.\n",
    "    encoded = tokenizer(text, return_tensors=\"pt\").to(model.device)\n",
    "    context = model.disable_adapter if disable else contextlib.nullcontext\n",
    "    model.eval()\n",
    "    with context():\n",
    "        outputs = model.generate(\n",
    "            input_ids=encoded[\"input_ids\"],\n",
    "            attention_mask=encoded[\"attention_mask\"],\n",
    "            max_new_tokens=128,\n",
    "            temperature=0.2,\n",
    "            top_k=50,\n",
    "            top_p=0.95,\n",
    "            do_sample=True,\n",
    "            repetition_penalty=1.0,\n",
    "            # Make the pad token explicit; `generate` otherwise falls back to the EOS id and warns.\n",
    "            pad_token_id=tokenizer.eos_token_id,\n",
    "        )\n",
    "    return tokenizer.batch_decode(outputs, skip_special_tokens=False)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "74f0b18e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Register a new adapter \"code_buddy\" built from the single adapter \"copilot\" with weight 0.8,\n",
    "# then make it the active adapter.\n",
    "model.add_weighted_adapter(adapters=[\"copilot\"], weights=[0.8], adapter_name=\"code_buddy\")\n",
    "model.set_adapter(\"code_buddy\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "884e3289",
   "metadata": {},
   "source": [
    "### Test infilling and code completion ability"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "bc159031",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<fim_prefix># coding=utf-8\n",
      "# Copyright 2023-present the HuggingFace Inc. team.\n",
      "#\n",
      "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
      "# you may not use this file except in compliance with the License.\n",
      "# You may obtain a copy of the License at\n",
      "#\n",
      "#     http://www.apache.org/licenses/LICENSE-2.0\n",
      "#\n",
      "# Unless required by applicable law or agreed to in writing, software\n",
      "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
      "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
      "# See the License for the specific language governing permissions and\n",
      "# limitations under the License.\n",
      "import math\n",
      "import re\n",
      "import warnings\n",
      "from dataclasses import asdict, dataclass, field, replace\n",
      "from enum import Enum\n",
      "from typing import List, Optional, Tuple, Union\n",
      "\n",
      "import torch\n",
      "import torch.nn as nn\n",
      "import torch.nn.functional as F\n",
      "from tqdm import tqdm\n",
      "from transformers.pytorch_utils import Conv1D\n",
      "\n",
      "from..config import PeftConfig\n",
      "from..import_utils import is_bnb_4bit_available, is_bnb_available\n",
      "from..utils import (\n",
      "    CLAMP_QUANTILE,\n",
      "    COMMON_LAYERS_PATTERN,\n",
      "    TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING,\n",
      "    ModulesToSaveWrapper,\n",
      "    PeftType,\n",
      "    _freeze_adapter,\n",
      "    _get_submodules,\n",
      "    transpose,\n",
      ")\n",
      "from.tuners_utils import BaseTuner, BaseTunerLayer\n",
      "\n",
      "\n",
      "if is_bnb_available():\n",
      "    import bitsandbytes as bnb\n",
      "\n",
      "\n",
      "@dataclass\n",
      "class BottleneckAdapterConfig(PeftConfig):\n",
      "    \"\"\"\n",
      "    <fim_suffix>\n",
      "    \"\"\" <fim_middle>Configuration for the [`BottleneckAdapter`].\n",
      "\n",
      "    Args:\n",
      "        in_features (`int`):\n",
      "            The number of input features.\n",
      "        out_features (`int`):\n",
      "            The number of output features.\n",
      "        hidden_features (`int`, *optional*, defaults to 1024):\n",
      "            The number of hidden features.\n",
      "        num_attention_heads (`int`, *optional*, defaults to 16):\n",
      "            The number of attention heads.\n",
      "        hidden_act (`str` or `function`, *optional*, defaults to `\"gelu\"`):\n",
      "            The non-linear activation function (function or string) in the encoder and pooler\n"
     ]
    }
   ],
   "source": [
    "# Docstring infilling: `prefix` is the head of a Python module (license header, imports, class\n",
    "# declaration) up to the opening quotes of a docstring; `suffix` holds the closing quotes.\n",
    "prefix = \"\"\"# coding=utf-8\n",
    "# Copyright 2023-present the HuggingFace Inc. team.\n",
    "#\n",
    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
    "# you may not use this file except in compliance with the License.\n",
    "# You may obtain a copy of the License at\n",
    "#\n",
    "#     http://www.apache.org/licenses/LICENSE-2.0\n",
    "#\n",
    "# Unless required by applicable law or agreed to in writing, software\n",
    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
    "# See the License for the specific language governing permissions and\n",
    "# limitations under the License.\n",
    "import math\n",
    "import re\n",
    "import warnings\n",
    "from dataclasses import asdict, dataclass, field, replace\n",
    "from enum import Enum\n",
    "from typing import List, Optional, Tuple, Union\n",
    "\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "from tqdm import tqdm\n",
    "from transformers.pytorch_utils import Conv1D\n",
    "\n",
    "from ..config import PeftConfig\n",
    "from ..import_utils import is_bnb_4bit_available, is_bnb_available\n",
    "from ..utils import (\n",
    "    CLAMP_QUANTILE,\n",
    "    COMMON_LAYERS_PATTERN,\n",
    "    TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING,\n",
    "    ModulesToSaveWrapper,\n",
    "    PeftType,\n",
    "    _freeze_adapter,\n",
    "    _get_submodules,\n",
    "    transpose,\n",
    ")\n",
    "from .tuners_utils import BaseTuner, BaseTunerLayer\n",
    "\n",
    "\n",
    "if is_bnb_available():\n",
    "    import bitsandbytes as bnb\n",
    "\n",
    "\n",
    "@dataclass\n",
    "class BottleneckAdapterConfig(PeftConfig):\n",
    "    \\\"\"\"\n",
    "    \"\"\"\n",
    "\n",
    "suffix = \"\"\"\n",
    "    \\\"\"\" \\\n",
    "\"\"\"\n",
    "\n",
    "print(get_code_completion(prefix=prefix, suffix=suffix))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "3b16362e",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<fim_prefix>from peft import LoraConfig, TaskType\n",
      "from transformers import AutoModelForCausalLM\n",
      "\n",
      "peft_config = LoraConfig(<fim_suffix>)<fim_middle>\n",
      "    task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1\n",
      ")\n",
      "\n",
      "model = AutoModelForCausalLM.from_pretrained(\"gpt2\")\n",
      "model = get_peft_model(model, peft_config<|endoftext|>\n"
     ]
    }
   ],
   "source": [
    "# Argument infilling: the model fills the gap between the opening `LoraConfig(` in `prefix`\n",
    "# and the closing parenthesis in `suffix`.\n",
    "prefix = \"\"\"\\\n",
    "from peft import LoraConfig, TaskType\n",
    "from transformers import AutoModelForCausalLM\n",
    "\n",
    "peft_config = LoraConfig(\"\"\"\n",
    "\n",
    "suffix = \")\"\n",
    "\n",
    "print(get_code_completion(prefix=prefix, suffix=suffix))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b3ae1034",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<fim_prefix>from accelerate import Accelerator\n",
      "\n",
      "accelerator = Accelerator()\n",
      "\n",
      "model, optimizer, training_dataloader, scheduler = <fim_suffix><fim_middle>accelerator.prepare(\n",
      "    model, optimizer, training_dataloader, scheduler\n",
      ")\n",
      "```\n",
      "\n",
      "## Saving and loading models\n",
      "\n",
      "To save a model, use the [`~Accelerator.save_state`] method:\n",
      "\n",
      "```py\n",
      "accelerator.save_state(model, \"my_model.pt\")\n",
      "```\n",
      "\n",
      "To load a model, use the [`~Accelerator.load_state`] method:\n",
      "\n",
      "```py\n",
      "accelerator.load_state(\"my_model.pt\")\n",
      "```\n",
      "\n",
      "## Sharing models\n",
      "\n",
      "To share a model with other users, use the\n"
     ]
    }
   ],
   "source": [
    "# Plain left-to-right completion: the suffix is empty, so only `prefix` conditions the output.\n",
    "prefix = \"\"\"from accelerate import Accelerator\n",
    "\n",
    "accelerator = Accelerator()\n",
    "\n",
    "model, optimizer, training_dataloader, scheduler = \"\"\"\n",
    "\n",
    "suffix = \"\"\n",
    "\n",
    "print(get_code_completion(prefix=prefix, suffix=suffix))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "b0446f84",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<fim_prefix>\n",
      "# Here is the correct implementation of the two sum code exercise\n",
      "# time complexity: O(N)\n",
      "# space complexity: O(N)\n",
      "def two_sum(arr, target_sum):\n",
      "<fim_suffix><fim_middle>    # initialize a dictionary to store the values and their indices\n",
      "    value_to_index = {}\n",
      "    for i, value in enumerate(arr):\n",
      "        if value in value_to_index:\n",
      "            return [value_to_index[value], i]\n",
      "        else:\n",
      "            value_to_index[target_sum - value] = i\n",
      "    return []\n",
      "<|endoftext|>\n"
     ]
    }
   ],
   "source": [
    "# Function-body completion from a comment header and a signature; the suffix is empty.\n",
    "prefix = \"\"\"\n",
    "# Here is the correct implementation of the two sum code exercise\n",
    "# time complexity: O(N)\n",
    "# space complexity: O(N)\n",
    "def two_sum(arr, target_sum):\n",
    "\"\"\"\n",
    "\n",
    "suffix = \"\"\n",
    "\n",
    "print(get_code_completion(prefix=prefix, suffix=suffix))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "bf3d92b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_model_pred(query, disable=False):\n",
    "    \"\"\"Generate an answer to `query` using a `Question: ... Answer:` prompt.\n",
    "\n",
    "    Args:\n",
    "        query: The question text inserted into the prompt.\n",
    "        disable: When True, generation runs inside `model.disable_adapter()`.\n",
    "\n",
    "    Returns:\n",
    "        The first decoded sequence: the prompt followed by the generated tokens, with special\n",
    "        tokens kept (`skip_special_tokens=False`).\n",
    "    \"\"\"\n",
    "    text = f\"Question: {query}\\n\\nAnswer:\"\n",
    "    # Keep the tokenizer's attention mask and follow the model's device instead of forwarding\n",
    "    # only `input_ids` through a hard-coded `.cuda()`; this removes the \"attention mask and the\n",
    "    # pad token id were not set\" warning emitted by `generate`.\n",
    "    encoded = tokenizer(text, return_tensors=\"pt\").to(model.device)\n",
    "    context = model.disable_adapter if disable else contextlib.nullcontext\n",
    "    model.eval()\n",
    "    with context():\n",
    "        outputs = model.generate(\n",
    "            input_ids=encoded[\"input_ids\"],\n",
    "            attention_mask=encoded[\"attention_mask\"],\n",
    "            max_new_tokens=1024,\n",
    "            temperature=0.2,\n",
    "            top_k=50,\n",
    "            top_p=0.95,\n",
    "            do_sample=True,\n",
    "            repetition_penalty=1.0,\n",
    "            eos_token_id=tokenizer.eos_token_id,\n",
    "            # Make the pad token explicit; `generate` otherwise falls back to the EOS id and warns.\n",
    "            pad_token_id=tokenizer.eos_token_id,\n",
    "        )\n",
    "    return tokenizer.batch_decode(outputs, skip_special_tokens=False)[0]\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "ed4ac5cf",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Question: Write a code snippet for using 🤗 Trainer to finetune `bigcode/starcoder` on dataset `smangrul/hf-stack-v1` for Causal language modeling task using 🤗 PEFT `LoraConfig` with `rank=16` and `alpha=32`.  \n",
      "\n",
      "\n",
      "Answer: Here is a code snippet for using 🤗 Trainer to finetune `bigcode/starcoder` on dataset `smangrul/hf-stack-v1` for Causal language modeling task using 🤗 PEFT `LoraConfig` with `rank=16` and `alpha=32`:\n",
      "\n",
      "```python\n",
      "from transformers import LoraConfig, Trainer\n",
      "from transformers.models.bigcode.modeling_bigcode import BigCodeConfig\n",
      "\n",
      "model_config = BigCodeConfig.from_pretrained(\"bigcode/starcoder\")\n",
      "model_config.peft_config = LoraConfig(rank=16, alpha=32)\n",
      "\n",
      "trainer = Trainer(\n",
      "    model=BigCodeForCausalLM.from_pretrained(\"bigcode/starcoder\", config=model_config),\n",
      "    args=training_args,\n",
      "    train_dataset=train_dataset,\n",
      "    eval_dataset=eval_dataset,\n",
      "    compute_metrics=compute_metrics,\n",
      ")\n",
      "\n",
      "trainer.train()\n",
      "```\n",
      "\n",
      "Note that the `model_config.peft_config` attribute is used to specify the PEFT configuration.<|endoftext|>\n"
     ]
    }
   ],
   "source": [
    "# Baseline: ask the question with the adapter disabled (`disable=True`) for comparison.\n",
    "query = \"\"\"Write a code snippet for using 🤗 Trainer to finetune `bigcode/starcoder` on dataset `smangrul/hf-stack-v1` \\\n",
    "for Causal language modeling task using 🤗 PEFT `LoraConfig` with `rank=16` and `alpha=32`.  \n",
    "\"\"\"\n",
    "\n",
    "print(get_model_pred(query=query, disable=True))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "da69b580",
   "metadata": {},
   "source": [
    "### Test assistance and conversation ability"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "f573159c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Question: Write a code snippet for using 🤗 Trainer to finetune `bigcode/starcoder` on dataset `smangrul/hf-stack-v1` for Causal language modeling task using 🤗 PEFT `LoraConfig` with `rank=16` and `alpha=32`.  \n",
      "\n",
      "\n",
      "Answer: Here is a code snippet for using 🤗 Trainer to finetune `bigcode/starcoder` on dataset `smangrul/hf-stack-v1` for Causal language modeling task using 🤗 PEFT `LoraConfig` with `rank=16` and `alpha=32`:\n",
      "\n",
      "```python\n",
      "from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments\n",
      "from peft import LoraConfig, get_peft_model\n",
      "\n",
      "model_name_or_path = \"bigcode/starcoder\"\n",
      "tokenizer_name_or_path = \"bigcode/starcoder\"\n",
      "dataset_name = \"smangrul/hf-stack-v1\"\n",
      "\n",
      "model = AutoModelForCausalLM.from_pretrained(model_name_or_path)\n",
      "tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)\n",
      "\n",
      "peft_config = LoraConfig(\n",
      "    task_type=\"CAUSAL_LM\", inference_mode=False, r=16, lora_alpha=32, bias=\"none\", lora_dropout=0.1\n",
      ")\n",
      "model = get_peft_model(model, peft_config)\n",
      "\n",
      "training_args = TrainingArguments(\n",
      "    output_dir=\"peft_lora_starcoder_stack\",\n",
      "    per_device_train_batch_size=16,\n",
      "    gradient_accumulation_steps=1,\n",
      "    learning_rate=1e-4,\n",
      "    warmup_steps=1000,\n",
      "    num_train_epochs=10,\n",
      "    logging_steps=100,\n",
      "    save_steps=1000,\n",
      "    evaluation_strategy=\"steps\",\n",
      "    eval_steps=1000,\n",
      "    load_best_model_at_end=True,\n",
      "    metric_for_best_model=\"eval_loss\",\n",
      "    push_to_hub=True,\n",
      "    hub_model_id=\"smangrul/peft_lora_starcoder_stack\",\n",
      ")\n",
      "\n",
      "trainer = Trainer(\n",
      "    model=model,\n",
      "    args=training_args,\n",
      "    train_dataset=dataset[\"train\"],\n",
      "    eval_dataset=dataset[\"test\"],\n",
      "    tokenizer=tokenizer,\n",
      ")\n",
      "\n",
      "trainer.train()\n",
      "```\n",
      "\n",
      "Here, we have used the `smangrul/hf-stack-v1` dataset from the 🤗 Datasets library. You can load your own dataset by replacing the `dataset_name` variable with the name of your dataset. You can also specify the `output_dir` and `hub_model_id` variables to save the model and its weights to a local directory or to a model repository on the Hub, respectively.\n",
      "\n",
      "Note that you can also use the 🤗 PEFT `get_peft_model` function to load a pretrained model with PEFT weights. For example, you can load a pretrained model with PEFT weights using the following code:\n",
      "\n",
      "```python\n",
      "from peft import get_peft_model\n",
      "\n",
      "model = get_peft_model(\"smangrul/peft_lora_starcoder_stack\")\n",
      "```\n",
      "\n",
      "This will automatically load the PEFT weights from the Hub and return a model with the same architecture as the original model. You can then use this model for inference or fine-tuning as you would normally do with any other 🤗 Transformers model.<fim_middle># PEFT for Causal Language Modeling with 🤗 Trainer\n",
      "\n",
      "This guide will show you how to use 🤗 Trainer to finetune a pretrained model with PEFT for Causal Language Modeling.\n",
      "\n",
      "## Prerequisites\n",
      "\n",
      "Before you begin, make sure you have all the necessary libraries installed:\n",
      "\n",
      "```bash\n",
      "pip install transformers datasets peft\n",
      "```\n",
      "\n",
      "## Load the dataset\n",
      "\n",
      "The first step is to load the dataset you want to use for training. Here, we will use the `smangrul/hf-stack-v1` dataset from the 🤗 Datasets library. You can load your own dataset by replacing the `dataset_name` variable with the name of your dataset.\n",
      "\n",
      "```python\n",
      "from datasets import load_dataset\n",
      "\n",
      "dataset_name = \"smangrul/hf-stack-v1\"\n",
      "dataset = load_dataset(dataset_name)\n",
      "```\n",
      "\n",
      "## Train the model\n",
      "\n",
      "Next, you can use 🤗 Trainer to finetune the model with PEFT. Here, we will use the `bigcode/starcoder` model from the 🤗 Transformers library and finetune it on the `smangrul/hf-stack-v1` dataset.\n",
      "\n",
      "First, create a `TrainingArguments` object to specify the training hyperparameters.\n"
     ]
    }
   ],
   "source": [
    "# Same question as the baseline cell, this time answered with the active adapter.\n",
    "query = \"\"\"Write a code snippet for using 🤗 Trainer to finetune `bigcode/starcoder` on dataset `smangrul/hf-stack-v1` \\\n",
    "for Causal language modeling task using 🤗 PEFT `LoraConfig` with `rank=16` and `alpha=32`.  \n",
    "\"\"\"\n",
    "\n",
    "print(get_model_pred(query=query))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "e4d1f31f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Question: How to use scrapy? Explain with an example.\n",
      "\n",
      "Answer: Scrapy is a Python library for web scraping. It allows you to extract data from websites and other online sources and save it to a database or a file. Scrapy is a powerful tool for data collection and web scraping, and it can be used in a variety of applications.\n",
      "\n",
      "Here is an example of how to use scrapy to extract data from a website:\n",
      "\n",
      "1. Install scrapy:\n",
      "\n",
      "```\n",
      "pip install scrapy\n",
      "```\n",
      "\n",
      "2. Create a project folder and navigate to it:\n",
      "\n",
      "```\n",
      "mkdir scrapy-tutorial\n",
      "cd scrapy-tutorial\n",
      "```\n",
      "\n",
      "3. Create a scrapy project:\n",
      "\n",
      "```\n",
      "scrapy startproject tutorial\n",
      "```\n",
      "\n",
      "4. Navigate to the scrapy project folder and create a spider:\n",
      "\n",
      "```\n",
      "cd tutorial\n",
      "scrapy genspider example https://www.example.com\n",
      "```\n",
      "\n",
      "5. Open the spider file and add a custom pipeline to extract data:\n",
      "\n",
      "```\n",
      "vim tutorial/spiders/example.py\n",
      "```\n",
      "\n",
      "The pipeline should look like this:\n",
      "\n",
      "```\n",
      "class ExamplePipeline(object):\n",
      "    def process_item(self, item, spider):\n",
      "        return item\n",
      "```\n",
      "\n",
      "6. Run the spider:\n",
      "\n",
      "```\n",
      "scrapy crawl example\n",
      "```\n",
      "\n",
      "7. The results are saved in a JSON file in the project folder. You can also save the results to a database or a file.<|endoftext|>\n"
     ]
    }
   ],
   "source": [
    "# General programming question answered with the active adapter.\n",
    "query = \"How to use scrapy? Explain with an example.\"\n",
    "\n",
    "print(get_model_pred(query=query))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dcdf40fc",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
